-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[OpenMP 6.0 ]Codegen for Reduction over private variables with reduction clause #134709
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[OpenMP 6.0 ]Codegen for Reduction over private variables with reduction clause #134709
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang Author: CHANDRA GHALE (chandraghale) ChangesCodegen support for reduction over private variable with reduction clause. Section 7.6.10 in in OpenMP 6.0 spec.
Patch is 24.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134709.diff 4 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 06a652c146fb9..3424227e5da79 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4899,6 +4899,150 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
}
}
+void CGOpenMPRuntime::emitPrivateReduction(
+ CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps) {
+
+ if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
+ return;
+
+ if (LHSExprs.size() != Privates.size() ||
+ LHSExprs.size() != ReductionOps.size())
+ return;
+
+ QualType PrivateType = Privates[0]->getType();
+ llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
+
+ BinaryOperatorKind MainBO = BO_Comma;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[0])) {
+ if (const auto *RHSExpr = BinOp->getRHS()) {
+ if (const auto *BORHS =
+ dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ MainBO = BORHS->getOpcode();
+ }
+ }
+ }
+
+ llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType);
+ const Expr *Private = Privates[0];
+
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Private)) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ if (const Expr *Init = VD->getInit()) {
+ if (Init->isConstantInitializer(CGF.getContext(), false)) {
+ Expr::EvalResult Result;
+ if (Init->EvaluateAsRValue(Result, CGF.getContext())) {
+ APValue &InitValue = Result.Val;
+ if (InitValue.isInt()) {
+ InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Create an internal shared variable
+ std::string SharedName = getName({"internal_private_var"});
+ llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage,
+ InitVal, ".omp.reduction." + SharedName, nullptr,
+ llvm::GlobalVariable::NotThreadLocal);
+
+ SharedVar->setAlignment(
+ llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
+
+ Address SharedResult(SharedVar, SharedVar->getValueType(),
+ CGF.getContext().getTypeAlignInChars(PrivateType));
+
+ llvm::Value *ThreadId = getThreadID(CGF, Loc);
+ llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
+ llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
+ llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
+
+ llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ(
+ ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
+ CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
+
+ CGF.EmitBlock(InitBB);
+ CGF.Builder.CreateStore(InitVal, SharedResult);
+ CGF.Builder.CreateBr(InitEndBB);
+
+ CGF.EmitBlock(InitEndBB);
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ for (unsigned I = 0; I < ReductionOps.size(); ++I) {
+ if (I >= LHSExprs.size()) {
+ break;
+ }
+
+ const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]);
+ if (!BinOp || BinOp->getOpcode() != BO_Assign)
+ continue;
+
+ const Expr *RHSExpr = BinOp->getRHS();
+ if (!RHSExpr)
+ continue;
+
+ BinaryOperatorKind BO = BO_Comma;
+ if (const auto *BORHS =
+ dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ BO = BORHS->getOpcode();
+ }
+
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
+ auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) {
+ if (BO == BO_Mul) {
+ llvm::Value *OldScalar = OldVal.getScalarVal();
+ llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
+ llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar);
+ return RValue::get(Result);
+ } else {
+ OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
+ BinOp->getLHS()->getType(),
+ ExprValueKind::VK_PRValue);
+ CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal);
+ return CGF.EmitAnyExpr(BinOp->getRHS());
+ }
+ };
+
+ (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
+ SharedLV, PrivateRV, BO, true,
+ llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ }
+
+ // Final barrier
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ // Broadcast final result
+ llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult);
+
+ // Update private variables with final result
+ for (unsigned I = 0; I < Privates.size(); ++I) {
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress());
+ }
+
+ // Final synchronization
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+}
+
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
@@ -5201,6 +5345,9 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+ if (Options.IsPrivateVarReduction) {
+ emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
+ }
}
/// Generates unique name for artificial threadprivate variables.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 4321712e1521d..50ba28b565b6d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1201,8 +1201,22 @@ class CGOpenMPRuntime {
struct ReductionOptionsTy {
bool WithNowait;
bool SimpleReduction;
+ bool IsPrivateVarReduction;
OpenMPDirectiveKind ReductionKind;
};
+
+ /// Emits code for private variable reduction
+ /// \param Privates List of private copies for original reduction arguments.
+ /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+ /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+ /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+ /// or 'operator binop(LHS, RHS)'.
+ void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
+ ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs,
+ ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps);
+
/// Emit a code for reduction clause. Next code should be emitted for
/// reduction:
/// \code
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index e4d1db264aac9..720a88e075ddd 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1470,6 +1470,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
llvm::SmallVector<const Expr *, 8> LHSExprs;
llvm::SmallVector<const Expr *, 8> RHSExprs;
llvm::SmallVector<const Expr *, 8> ReductionOps;
+ llvm::SmallVector<bool, 8> IsPrivate;
bool HasAtLeastOneReduction = false;
bool IsReductionWithTaskMod = false;
for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
@@ -1480,6 +1481,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
Privates.append(C->privates().begin(), C->privates().end());
LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+ IsPrivate.append(C->private_var_reduction_flags().begin(),
+ C->private_var_reduction_flags().end());
ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
IsReductionWithTaskMod =
IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
@@ -1499,9 +1502,11 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
bool SimpleReduction = ReductionKind == OMPD_simd;
// Emit nowait reduction if nowait clause is present or directive is a
// parallel directive (it always has implicit barrier).
+ bool IsPrivateVarReduction =
+ llvm::any_of(IsPrivate, [](bool IsPriv) { return IsPriv; });
CGM.getOpenMPRuntime().emitReduction(
*this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps,
- {WithNowait, SimpleReduction, ReductionKind});
+ {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind});
}
}
@@ -3943,7 +3948,8 @@ static void emitScanBasedDirective(
PrivScope.Privatize();
CGF.CGM.getOpenMPRuntime().emitReduction(
CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true,
+ /*IsPrivateVarReduction */ false, OMPD_unknown});
}
llvm::Value *NextIVal =
CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
@@ -5747,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
}
CGM.getOpenMPRuntime().emitReduction(
*this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true, false, OMPD_simd});
for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
const Expr *PrivateExpr = Privates[I];
LValue DestLVal;
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
new file mode 100644
index 0000000000000..be50991ca193e
--- /dev/null
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -0,0 +1,236 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#define N 10
+void do_red(int n, int *v, int &sum_v)
+ {
+ sum_v = 0;
+ #pragma omp for reduction(original(private),+: sum_v)
+ for (int i = 0; i < n; i++)
+ {
+ sum_v += v[i];
+ }
+ }
+ int main(void)
+ {
+ int v[N];
+ for (int i = 0; i < N; i++)
+ v[i] = i;
+ #pragma omp parallel num_threads(4)
+ {
+ int s_v;
+ do_red(N, v, s_v);
+ }
+ return 0;
+ }
+//.
+// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4
+// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+//.
+// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi(
+// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]]
+// CHECK: [[OMP_PRECOND_THEN]]:
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[T...
[truncated]
|
if (Options.IsPrivateVarReduction) { | ||
emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drop braces
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done !!
Refactored code to look up the expression and handle UDR, emitting it as is. updated the lit test. |
@alexey-bataev . Reworked on UDR. Will be helpful if you will be able to review further. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to update OpenMPSupport.rst and release notes
@@ -4899,6 +4899,238 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, | |||
} | |||
} | |||
|
|||
void CGOpenMPRuntime::emitPrivateReduction( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- Add the comments, describing the logic of the function.
- How does it differ from the regular reductions codegen? Can you try to unify the logic to reduce the maintenance cost?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- Updated the code with comments and clarified the logic
- Updated OpenMPSupport.rst and the release notes.
- Unified the logic as much as possible.
- In regular reduction codegen, threads update a shared variable directly. With private reduction, each thread first works with its own private copy, and then all these partial results are combined into a shared variable. The final result is copied back to each thread’s original variable
c53e227
to
384cd4a
Compare
@alexey-bataev updated few more test with complex. Any more feedback. |
d234100
to
ac02611
Compare
Addressed release-note rebase problem. |
ac02611
to
4cb9624
Compare
01ca497
to
164f2e8
Compare
ping @alexey-bataev |
164f2e8
to
0d50b0d
Compare
clang/lib/CodeGen/CGStmtOpenMP.cpp
Outdated
IsPrivate.append(C->private_var_reduction_flags().begin(), | ||
C->private_var_reduction_flags().end()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if there is a mix of private and non-private reductions in a single construct? IsPrivateVarReduction flag is set for all reduced value, not matter if they are private or not
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing, I have fixed it now. Mix of private of non-private reduction is now correctly populating.
0d50b0d
to
70b7e90
Compare
llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable( | ||
CGM.getModule(), LLVMType, false, llvm::GlobalValue::InternalLinkage, | ||
llvm::Constant::getNullValue(LLVMType), ".omp.reduction." + SharedName, | ||
nullptr, llvm::GlobalVariable::NotThreadLocal); | ||
|
||
SharedVar->setAlignment( | ||
llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use OMPBuilder.getOrCreateInternalVariable
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done !!!
Address SharedResult(SharedVar, SharedVar->getValueType(), | ||
CGF.getContext().getTypeAlignInChars(PrivateType)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use CGF.MakeNaturalAlignRawAddrLValue
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done !!
} | ||
} else { | ||
CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, | ||
PrivateType.getQualifiers(), true); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PrivateType.getQualifiers(), true); | |
PrivateType.getQualifiers(), /*IsInitializer=*/true); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done !!
70b7e90
to
4e244c3
Compare
clang/lib/CodeGen/CGStmtOpenMP.cpp
Outdated
@@ -5748,7 +5752,8 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) { | |||
} | |||
CGM.getOpenMPRuntime().emitReduction( | |||
*this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, | |||
{/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd}); | |||
{/*WithNowait=*/true, /*SimpleReduction=*/true, | |||
/*IsPrivateVarReduction*/ {false}, OMPD_simd}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/*IsPrivateVarReduction*/ {false}, OMPD_simd}); | |
/*IsPrivateVarReduction=*/{}, OMPD_simd}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done !!
clang/lib/CodeGen/CGStmtOpenMP.cpp
Outdated
@@ -3943,7 +3946,8 @@ static void emitScanBasedDirective( | |||
PrivScope.Privatize(); | |||
CGF.CGM.getOpenMPRuntime().emitReduction( | |||
CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, | |||
{/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown}); | |||
{/*WithNowait=*/true, /*SimpleReduction=*/true, | |||
/*IsPrivateVarReduction*/ {false}, OMPD_unknown}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/*IsPrivateVarReduction*/ {false}, OMPD_unknown}); | |
/*IsPrivateVarReduction=*/{}, OMPD_unknown}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done !!
4e244c3
to
694e241
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG with nits
Codegen support for reduction over private variable with reduction clause. Section 7.6.10 in in OpenMP 6.0 spec.
Sample Test Case from OpenMP 6.0 Example
Expected Codegen: